Due by midnight on Tuesday November 2, 2021. Answer all of the following problems. These problems should be completed in this notebook (using the R kernel). Computational questions may require code, plots, analysis, interpretation, etc. Working in small groups is allowed, but it is important that you make an effort to master the material and hand in your own work.
library(faraway)
data(teengamb)
head(teengamb)
## sex status income verbal gamble
## 1 1 51 2.00 8 0.0
## 2 1 28 2.50 8 0.0
## 3 1 37 2.00 6 0.0
## 4 1 28 7.00 4 7.3
## 5 1 65 2.00 8 19.6
## 6 1 61 3.47 6 0.1
set.seed(123)
lmod = lm(gamble ~ sex + status + income + verbal, data = teengamb)
summary(lmod)
##
## Call:
## lm(formula = gamble ~ sex + status + income + verbal, data = teengamb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.082 -11.320 -1.451 9.452 94.252
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.55565 17.19680 1.312 0.1968
## sex -22.11833 8.21111 -2.694 0.0101 *
## status 0.05223 0.28111 0.186 0.8535
## income 4.96198 1.02539 4.839 1.79e-05 ***
## verbal -2.95949 2.17215 -1.362 0.1803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.69 on 42 degrees of freedom
## Multiple R-squared: 0.5267, Adjusted R-squared: 0.4816
## F-statistic: 11.69 on 4 and 42 DF, p-value: 1.815e-06
plot(lmod)
print("There does not seem to be any violations of linearuty or normality, however there might hint of non constant variance due to points 24 & 39. Based on Cook's distance, point 24 needs further investigation.")
## [1] "There does not seem to be any violations of linearuty or normality, however there might hint of non constant variance due to points 24 & 39. Based on Cook's distance, point 24 needs further investigation."
This link contains advertising data. This dataset contains, in thousands of dollars, TV, Radio, and Newspaper budgets for 200 different markets along with the Sales, in thousands of units, for each market.
adv <- read.csv("https://www.colorado.edu/amath/sites/default/files/attached-files/advertising.txt", sep = "")
data = read.table("https://www.colorado.edu/amath/sites/default/files/attached-files/advertising.txt")
head(data)
## TV radio newspaper sales
## 1 230.1 37.8 69.2 22.1
## 2 44.5 39.3 45.1 10.4
## 3 17.2 45.9 69.3 9.3
## 4 151.5 41.3 58.5 18.5
## 5 180.8 10.8 58.4 12.9
## 6 8.7 48.9 75.0 7.2
slr = lm(sales ~ radio, data = data)
summary(slr)
##
## Call:
## lm(formula = sales ~ radio, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7305 -2.1324 0.7707 2.7775 8.1810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.31164 0.56290 16.542 <2e-16 ***
## radio 0.20250 0.02041 9.921 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.275 on 198 degrees of freedom
## Multiple R-squared: 0.332, Adjusted R-squared: 0.3287
## F-statistic: 98.42 on 1 and 198 DF, p-value: < 2.2e-16
plot(slr)
print("Based on the rasidual vs. fitted plot there seems to be a violation of non-constant variance. An based on the Q-Q plot, there might be an indication of a viloation of normality. There does not seem to be any observable violation of linearlity.")
## [1] "Based on the rasidual vs. fitted plot there seems to be a violation of non-constant variance. An based on the Q-Q plot, there might be an indication of a viloation of normality. There does not seem to be any observable violation of linearlity."
slr2 = lm(sales ~ TV, data = data)
summary(slr2)
##
## Call:
## lm(formula = sales ~ TV, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
plot(slr2)
print("Based on the rasidual vs. fitted plot there seems to be a violation of non-constant variance. There does not seem to be any observable violation of linearlity or normality. There is no observable relationship between the two variables.")
## [1] "Based on the rasidual vs. fitted plot there seems to be a violation of non-constant variance. There does not seem to be any observable violation of linearlity or normality. There is no observable relationship between the two variables."
slr3 = lm(sales ~ newspaper, data = data)
summary(slr3)
##
## Call:
## lm(formula = sales ~ newspaper, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2272 -3.3873 -0.8392 3.5059 12.7751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.35141 0.62142 19.88 < 2e-16 ***
## newspaper 0.05469 0.01658 3.30 0.00115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.092 on 198 degrees of freedom
## Multiple R-squared: 0.05212, Adjusted R-squared: 0.04733
## F-statistic: 10.89 on 1 and 198 DF, p-value: 0.001148
plot(slr3)
print("There does not seem to be any observable violations. Though, there might be a correlation between the two variables.")
## [1] "There does not seem to be any observable violations. Though, there might be a correlation between the two variables."
mlr = lm(sales ~ radio + TV, data = data)
summary(mlr)
##
## Call:
## lm(formula = sales ~ radio + TV, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## radio 0.18799 0.00804 23.382 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
plot(mlr)
print("The mlr seem to have fixed the non-constant variance issue, though normality still seems a bit off.")
## [1] "The mlr seem to have fixed the non-constant variance issue, though normality still seems a bit off."
mlr2 = lm(sales ~ radio + I((TV)^2), data = data)
summary(mlr)
##
## Call:
## lm(formula = sales ~ radio + TV, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## radio 0.18799 0.00804 23.382 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
plot(mlr)
print("There does not seem to be any observable difference.")
## [1] "There does not seem to be any observable difference."
Researchers at the National Institutes of Standards and Technology (NIST) collected pipline data on ultrasonic measurements of the depth of defects in the Alaska pipeline in the field. The depths of the defects were then remeasured in the laboratory. The laboratory measurements are more accurate than the field measurements, but more time consuming and expensive. We want to develop a regression model for correcting the in field measurements.
data(pipeline)
head(pipeline)
## Field Lab Batch
## 1 18 20.2 1
## 2 38 56.0 1
## 3 15 12.5 1
## 4 20 21.2 1
## 5 18 15.5 1
## 6 36 39.0 1
lmod = lm(Lab ~ Field, data = pipeline)
summary(lmod)
##
## Call:
## lm(formula = Lab ~ Field, data = pipeline)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.985 -4.072 -1.431 2.504 24.334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.96750 1.57479 -1.249 0.214
## Field 1.22297 0.04107 29.778 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.865 on 105 degrees of freedom
## Multiple R-squared: 0.8941, Adjusted R-squared: 0.8931
## F-statistic: 886.7 on 1 and 105 DF, p-value: < 2.2e-16
plot(lmod)
print("There is a violation of non-constant variance.")
## [1] "There is a violation of non-constant variance."
lmod2 = lm(Lab ~ I(sqrt(Field)) , data = pipeline)
summary(lmod2)
##
## Call:
## lm(formula = Lab ~ I(sqrt(Field)), data = pipeline)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.706 -5.843 -1.343 5.538 22.652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.1385 2.8573 -12.65 <2e-16 ***
## I(sqrt(Field)) 13.5486 0.4931 27.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.446 on 105 degrees of freedom
## Multiple R-squared: 0.8779, Adjusted R-squared: 0.8767
## F-statistic: 755 on 1 and 105 DF, p-value: < 2.2e-16
plot(lmod2)
i = order(pipeline$Field); #Returning a permutation which rearranges its first argument into ascending or descending order, breaking ties by further arguments.
npipe = pipeline[i,]; #Setting the ordered permutations into a df
ff = gl(12,9)[-108]; #Generating factors by specifying the pattern of their levels.
meanfield = unlist(lapply(split(npipe$Field,ff),mean)) #Converting a list to vector
varlab = unlist(lapply(split(npipe$Lab,ff),var)) #Simplifying to produce a vector by preserving all components
plot(log10(varlab) ,log10(meanfield))
df<-data.frame(cbind(as.numeric(meanfield),as.numeric(varlab)))
df <- df[-c(12),]
lm.var.model <- lm(log(varlab) ~ log(meanfield) , data= df)
summary(lm.var.model)
##
## Call:
## lm(formula = log(varlab) ~ log(meanfield), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2038 -0.6729 0.1656 0.7205 1.1891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3538 1.5715 -0.225 0.8264
## log(meanfield) 1.1244 0.4617 2.435 0.0351 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.018 on 10 degrees of freedom
## Multiple R-squared: 0.3723, Adjusted R-squared: 0.3095
## F-statistic: 5.931 on 1 and 10 DF, p-value: 0.03513
pipeline<- pipeline[with(pipeline, order(Field)), ]
a0 <-10^summary(lm.var.model)$coefficients[1]
a1 <-summary(lm.var.model)$coefficients[2]
var.lab <- a0 * pipeline$Field^a1
se.lab <- sqrt(var.lab)
summary(se.lab)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.645 3.379 4.911 4.603 5.761 8.088
plot(se.lab)
print("Violation of normality with a low R62, thus not a good fit!")
## [1] "Violation of normality with a low R62, thus not a good fit!"